In [1]:
import random
from datetime import timedelta

import numpy as np
import pandas as pd

# NOTE(review): matplotlib.pyplot is conventionally aliased as `plt`;
# `ply` is nonstandard — confirm no later cell relies on this name
# before renaming it.
import matplotlib.pyplot as ply
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import calmap
import folium

# Shared color palette used by the plots below:
# confirmed, deaths, recovered, active.
cnf, dth, rec, act = '#393e46', '#ff2e63', '#21bf73', '#fe9801'
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()  # let matplotlib plot pandas datetime values directly
import warnings
warnings.filterwarnings('ignore')  # suppress library warnings for cleaner notebook output
In [2]:
# Enable plotly's notebook renderer; connected=True loads plotly.js
# from the CDN instead of embedding it in the notebook file.
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)
In [3]:
# Load the county-level US COVID-19 time series, parsing the Date
# column to datetime up front.
# NOTE(review): hardcoded absolute Windows path — consider a
# configurable DATA_DIR before sharing this notebook.
full_table = pd.read_csv('C:/Users/rsingh/usa_county_wise.csv', 
                         parse_dates=['Date'])
full_table.sample(6)
Out[3]:
UID iso2 iso3 code3 FIPS Admin2 Province_State Country_Region Lat Long_ Combined_Key Date Confirmed Deaths
599746 84035017 NM USA 840 35017.0 Grant New Mexico US 32.739113 -108.382784 Grant, New Mexico, US 2020-07-19 49 0
384889 84018021 IN USA 840 18021.0 Clay Indiana US 39.395618 -87.117235 Clay, Indiana, US 2020-05-16 29 1
406116 84037009 NC USA 840 37009.0 Ashe North Carolina US 36.432962 -81.498627 Ashe, North Carolina, US 2020-05-22 32 0
242066 84029043 MO USA 840 29043.0 Christian Missouri US 36.971178 -93.185371 Christian, Missouri, US 2020-04-03 12 0
86229 84048249 TX USA 840 48249.0 Jim Wells Texas US 27.731039 -98.090470 Jim Wells, Texas, US 2020-02-16 0 0
276835 84051109 VA USA 840 51109.0 Louisa Virginia US 37.978503 -77.962748 Louisa, Virginia, US 2020-04-13 27 0

Preprocessing

In [4]:
# Derive active cases: confirmed minus deaths.
full_table['Active'] = full_table['Confirmed'].sub(full_table['Deaths'])

# Replace missing values: empty string for the state name,
# zero for each of the numeric case-count columns.
full_table['Province_State'] = full_table['Province_State'].fillna('')
for count_col in ['Confirmed', 'Deaths', 'Active']:
    full_table[count_col] = full_table[count_col].fillna(0)

full_table.sample(6)
Out[4]:
UID iso2 iso3 code3 FIPS Admin2 Province_State Country_Region Lat Long_ Combined_Key Date Confirmed Deaths Active
383187 84045059 SC USA 840 45059.0 Laurens South Carolina US 34.482444 -82.004908 Laurens, South Carolina, US 2020-05-15 57 3 54
511412 84009005 CT USA 840 9005.0 Litchfield Connecticut US 41.793753 -73.245148 Litchfield, Connecticut, US 2020-06-23 1469 136 1333
196669 84051097 VA USA 840 51097.0 King and Queen Virginia US 37.710449 -76.886896 King and Queen, Virginia, US 2020-03-20 0 0 0
560525 84048281 TX USA 840 48281.0 Lampasas Texas US 31.195313 -98.242066 Lampasas, Texas, US 2020-07-07 20 0 20
469072 84027153 MN USA 840 27153.0 Todd Minnesota US 46.069962 -94.897853 Todd, Minnesota, US 2020-06-10 364 2 362
134713 84021077 KY USA 840 21077.0 Gallatin Kentucky US 38.753987 -84.855838 Gallatin, Kentucky, US 2020-03-02 0 0 0
In [5]:
# Grouped by day, State
# =======================

# State-level daily totals (summed over counties).
# Column selection uses a list — tuple-style selection after groupby
# (df.groupby(...)['a', 'b']) is deprecated/removed in modern pandas.
full_grouped = full_table.groupby(['Date', 'Province_State'])[['Confirmed', 'Deaths', 'Active']].sum().reset_index()

# new cases ======================================================
# Day-over-day differences within each state give the daily new counts.
temp = full_grouped.groupby(['Province_State', 'Date'])[['Confirmed', 'Deaths']]
temp = temp.sum().diff().reset_index()

# The first row of each state diffs against the previous state's last
# row — blank those spurious values out.
mask = temp['Province_State'] != temp['Province_State'].shift(1)

temp.loc[mask, 'Confirmed'] = np.nan
temp.loc[mask, 'Deaths'] = np.nan


# renaming columns
temp.columns = ['Province_State', 'Date', 'New cases', 'New deaths']
# =================================================================

# merging new values
full_grouped = pd.merge(full_grouped, temp, on=['Province_State', 'Date'])

# filling na with 0
full_grouped = full_grouped.fillna(0)

# fixing data types
cols = ['New cases', 'New deaths']
full_grouped[cols] = full_grouped[cols].astype('int')

# Negative daily diffs come from reporting corrections — floor at zero.
full_grouped['New cases'] = full_grouped['New cases'].clip(lower=0)

full_grouped.head()
Out[5]:
Date Province_State Confirmed Deaths Active New cases New deaths
0 2020-01-22 Alabama 0 0 0 0 0
1 2020-01-22 Alaska 0 0 0 0 0
2 2020-01-22 American Samoa 0 0 0 0 0
3 2020-01-22 Arizona 0 0 0 0 0
4 2020-01-22 Arkansas 0 0 0 0 0
In [6]:
# Day wise
# ========

# National totals per day. List-based column selection replaces the
# deprecated tuple-style groupby selection.
day_wise = full_grouped.groupby('Date')[['Confirmed', 'Deaths', 'Active', 'New cases']].sum().reset_index()

# number of deaths per 100 confirmed cases (case-fatality percentage)
day_wise['Deaths / 100 Cases'] = round((day_wise['Deaths'] / day_wise['Confirmed']) * 100, 2)

# no. of states with at least one confirmed case on each day
day_wise['No. of States'] = full_grouped[full_grouped['Confirmed'] != 0].groupby('Date')['Province_State'].nunique().values

# 0/0 above yields NaN on the earliest days with no cases — treat as 0
cols = ['Deaths / 100 Cases']
day_wise[cols] = day_wise[cols].fillna(0)

day_wise.head()
Out[6]:
Date Confirmed Deaths Active New cases Deaths / 100 Cases No. of States
0 2020-01-22 1 0 1 0 0.0 1
1 2020-01-23 1 0 1 0 0.0 1
2 2020-01-24 2 0 2 1 0.0 2
3 2020-01-25 2 0 2 0 0.0 2
4 2020-01-26 5 0 5 3 0.0 4
In [7]:
# State wise
# ============

# getting latest values (snapshot of the most recent date)
state_wise = full_grouped[full_grouped['Date'] == full_grouped['Date'].max()].reset_index(drop=True).drop('Date', axis=1)

# group by state — list selection instead of the deprecated
# tuple-style groupby column selection
state_wise = state_wise.groupby('Province_State')[['Confirmed', 'Deaths', 'Active', 'New cases']].sum().reset_index()

# deaths per 100 confirmed cases
state_wise['Deaths / 100 Cases'] = round((state_wise['Deaths'] / state_wise['Confirmed']) * 100, 2)

# 0/0 yields NaN for states with zero confirmed cases — treat as 0
cols = ['Deaths / 100 Cases']
state_wise[cols] = state_wise[cols].fillna(0)

state_wise_sort = state_wise.sort_values('Confirmed', ascending=False)

state_wise_sort.head()
Out[7]:
Province_State Confirmed Deaths Active New cases Deaths / 100 Cases
5 California 458121 8494 449627 5833 1.85
11 Florida 432747 5941 426806 8892 1.37
36 New York 412344 32645 379699 608 7.92
49 Texas 400336 5633 394703 6252 1.41
34 New Jersey 179812 15804 164008 449 8.79
In [8]:
# Attach state populations and compute a per-capita rate.

# Read the 2020 state-population table; only the first two columns
# (state name, population) are needed.
pop = pd.read_csv('C:/Users/rsingh/usa_population_by_state_2020.csv')
pop = pop.iloc[:, :2].set_axis(['Province_State', 'Population'], axis=1)

# Left join keeps every state row even when no population matches.
state_wise = state_wise.merge(pop, on='Province_State', how='left')

# Confirmed cases per 1000 residents, rounded to a whole number.
state_wise['Cases / 1000 People'] = round((state_wise['Confirmed'] / state_wise['Population']) * 1000)

state_wise.head()
Out[8]:
Province_State Confirmed Deaths Active New cases Deaths / 100 Cases Population Cases / 1000 People
0 Alabama 81115 1447 79668 1821 1.78 4903185 17.0
1 Alaska 2623 67 2556 98 2.55 731545 4.0
2 American Samoa 0 0 0 0 0.00 55641 0.0
3 Arizona 163827 3349 160478 1813 2.04 7278717 23.0
4 Arkansas 39447 408 39039 824 1.03 3017804 13.0
In [9]:
today = full_grouped[full_grouped['Date']==max(full_grouped['Date'])].reset_index(drop=True).drop('Date', axis=1)[['Province_State', 'Confirmed']]
last_week = full_grouped[full_grouped['Date']==max(full_grouped['Date'])-timedelta(days=7)].reset_index(drop=True).drop('Date', axis=1)[['Province_State', 'Confirmed']]

temp = pd.merge(today, last_week, on='Province_State', suffixes=(' today', ' last week'))

temp['1 week change'] = temp['Confirmed today'] - temp['Confirmed last week']

temp = temp[['Province_State', 'Confirmed last week', '1 week change']]

state_wise = pd.merge(state_wise, temp, on='Province_State')

state_wise['1 week % increase'] = round(state_wise['1 week change']/state_wise['Confirmed last week']*100, 2)

state_wise.sort_values('Deaths')

state_wise.head()
Out[9]:
Province_State Confirmed Deaths Active New cases Deaths / 100 Cases Population Cases / 1000 People Confirmed last week 1 week change 1 week % increase
0 Alabama 81115 1447 79668 1821 1.78 4903185 17.0 68891 12224 17.74
1 Alaska 2623 67 2556 98 2.55 731545 4.0 1950 673 34.51
2 American Samoa 0 0 0 0 0.00 55641 0.0 0 0 NaN
3 Arizona 163827 3349 160478 1813 2.04 7278717 23.0 145183 18644 12.84
4 Arkansas 39447 408 39039 824 1.03 3017804 13.0 33927 5520 16.27
In [10]:
# National totals on the latest date, split into Active vs Deaths.
# List-based column selection replaces the deprecated tuple-style
# groupby selection.
temp = full_table.groupby('Date')[['Confirmed', 'Deaths', 'Active']].sum().reset_index()
temp = temp[temp['Date'] == max(temp['Date'])].reset_index(drop=True)

# Long format (one row per case type) for the treemap.
tm = temp.melt(id_vars="Date", value_vars=['Active', 'Deaths'])
fig = px.treemap(tm, path=["variable"], values="value", height=300, width=1000,
                 color_discrete_sequence=[act, rec, dth])
fig.data[0].textinfo = 'label+text+value'
fig.show()
In [11]:
# Stacked area chart of deaths and active cases over time.
# List-based column selection replaces the deprecated tuple-style
# groupby selection; a discarded mid-cell `temp.head()` was removed.
temp = full_table.groupby('Date')[['Deaths', 'Active']].sum().reset_index()
temp = temp.melt(id_vars="Date", value_vars=['Deaths', 'Active'],
                 var_name='Case', value_name='Count')

fig = px.area(temp, x="Date", y="Count", color='Case', height=400,
              title='Cases in USA over time', color_discrete_sequence=[rec, dth, act])
fig.update_layout(xaxis_rangeslider_visible=True)  # range slider for zooming
fig.show()

On the Map (COVID-19)

In [12]:
# Animated choropleth of confirmed cases per state over time.
# log1p instead of log: the many rows with zero confirmed cases would
# otherwise map to -inf and corrupt the continuous color scale.
fig = px.choropleth(full_table, locations="iso2", locationmode='USA-states',
                    color=np.log1p(full_table["Confirmed"]),
                    hover_name="iso2",
                    animation_frame=full_table["Date"].dt.strftime('%Y-%m-%d'),
                    title='Cases over time',
                    color_continuous_scale=px.colors.sequential.Magenta)
fig.update(layout_coloraxis_showscale=False)  # log-scale legend values would mislead
fig.show()
In [13]:
# Snapshot of the most recent day in the data.
full_latest = full_table[full_table['Date'] == full_table['Date'].max()]


def show_state_treemap(metric, title):
    """Render a treemap of `metric` per state for the latest snapshot."""
    ordered = full_latest.sort_values(by=metric, ascending=False).reset_index(drop=True)
    fig = px.treemap(ordered, path=["Province_State"], values=metric, height=700,
                     title=title,
                     color_discrete_sequence=px.colors.qualitative.Dark2)
    fig.data[0].textinfo = 'label+text+value'
    fig.show()


show_state_treemap('Confirmed', 'Confirmed Cases by State')
show_state_treemap('Deaths', 'Deaths reported by State')
In [ ]: